{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import torch\n",
    "\n",
    "from transformers import BertTokenizer\n",
    "from transformers import BertForSequenceClassification, AdamW, BertConfig\n",
    "from transformers import get_linear_schedule_with_warmup\n",
    "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "import time\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel('/content/df.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df['text']\n",
    "y = df['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.DataFrame({'text': X_train, 'label': y_train})\n",
    "test = pd.DataFrame({'text': X_test, 'label': y_test})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = train['text']\n",
    "sentences[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BERT Structure\n",
    "sentences = [\"[CLS] \" + str(sentence) + \" [SEP]\" for sentence in sentences]\n",
    "sentences[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract Label\n",
    "labels = train['label'].values\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)\n",
    "tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]\n",
    "\n",
    "print (sentences[0])\n",
    "print (tokenized_texts[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Max sequence\n",
    "MAX_LEN = 128\n",
    "\n",
    "# Token to index\n",
    "input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]\n",
    "\n",
    "# Max_len, add 0 for nan\n",
    "input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=\"long\", truncating=\"post\", padding=\"post\")\n",
    "\n",
    "input_ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attention\n",
    "attention_masks = []\n",
    "\n",
    "#  if not padding==1, otherwise == 0\n",
    "for seq in input_ids:\n",
    "    seq_mask = [float(i>0) for i in seq]\n",
    "    attention_masks.append(seq_mask)\n",
    "\n",
    "print(attention_masks[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train set and Validation set\n",
    "train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,\n",
    "                                                                                    labels,\n",
    "                                                                                    random_state=2018,\n",
    "                                                                                    test_size=0.1)\n",
    "\n",
    "# Classify attention mask to train and test\n",
    "train_masks, validation_masks, _, _ = train_test_split(attention_masks,\n",
    "                                                       input_ids,\n",
    "                                                       random_state=2018,\n",
    "                                                       test_size=0.1)\n",
    "\n",
    "# data to tensor\n",
    "train_inputs = torch.tensor(train_inputs)\n",
    "train_labels = torch.tensor(train_labels)\n",
    "train_masks = torch.tensor(train_masks)\n",
    "validation_inputs = torch.tensor(validation_inputs)\n",
    "validation_labels = torch.tensor(validation_labels)\n",
    "validation_masks = torch.tensor(validation_masks)\n",
    "\n",
    "print(train_inputs[0])\n",
    "print(train_labels[0])\n",
    "print(train_masks[0])\n",
    "print(validation_inputs[0])\n",
    "print(validation_labels[0])\n",
    "print(validation_masks[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# batch size\n",
    "batch_size = 32\n",
    "\n",
    "# Insert Dataloader from Pytorch, mask and lavel\n",
    "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n",
    "train_sampler = RandomSampler(train_data)\n",
    "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n",
    "\n",
    "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n",
    "validation_sampler = SequentialSampler(validation_data)\n",
    "validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = test['text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = [\"[CLS] \" + str(sentence) + \" [SEP]\" for sentence in sentences]\n",
    "sentences[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = test['label'].values\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)\n",
    "tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]\n",
    "\n",
    "print (sentences[0])\n",
    "print (tokenized_texts[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_LEN = 128\n",
    "\n",
    "input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]\n",
    "\n",
    "input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=\"long\", truncating=\"post\", padding=\"post\")\n",
    "\n",
    "input_ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "attention_masks = []\n",
    "\n",
    "for seq in input_ids:\n",
    "    seq_mask = [float(i>0) for i in seq]\n",
    "    attention_masks.append(seq_mask)\n",
    "\n",
    "print(attention_masks[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_inputs = torch.tensor(input_ids)\n",
    "test_labels = torch.tensor(labels)\n",
    "test_masks = torch.tensor(attention_masks)\n",
    "\n",
    "print(test_inputs[0])\n",
    "print(test_labels[0])\n",
    "print(test_masks[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 32\n",
    "\n",
    "\n",
    "test_data = TensorDataset(test_inputs, test_masks, test_labels)\n",
    "test_sampler = RandomSampler(test_data)\n",
    "test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GPU device\n",
    "device_name = tf.test.gpu_device_name()\n",
    "\n",
    "# GPU test\n",
    "if device_name == '/device:GPU:0':\n",
    "    print('Found GPU at: {}'.format(device_name))\n",
    "else:\n",
    "    raise SystemError('GPU device not found')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Device confirm\n",
    "if torch.cuda.is_available():\n",
    "    device = torch.device(\"cuda\")\n",
    "    print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
    "    print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
    "else:\n",
    "    device = torch.device(\"cpu\")\n",
    "    print('No GPU available, using the CPU instead.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BERT for Classification\n",
    "model = BertForSequenceClassification.from_pretrained(\"bert-base-multilingual-cased\", num_labels=2)\n",
    "model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optimizer\n",
    "optimizer = AdamW(model.parameters(),\n",
    "                  lr = 2e-5, # 학습률\n",
    "                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값\n",
    "                )\n",
    "\n",
    "# Epochs\n",
    "epochs = 3\n",
    "\n",
    "# Total step : batch * epoch\n",
    "total_steps = len(train_dataloader) * epochs\n",
    "\n",
    "# Scheduler\n",
    "scheduler = get_linear_schedule_with_warmup(optimizer,\n",
    "                                            num_warmup_steps = 0,\n",
    "                                            num_training_steps = total_steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy\n",
    "def flat_accuracy(preds, labels):\n",
    "\n",
    "    pred_flat = np.argmax(preds, axis=1).flatten()\n",
    "    labels_flat = labels.flatten()\n",
    "\n",
    "    return np.sum(pred_flat == labels_flat) / len(labels_flat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time\n",
    "def format_time(elapsed):\n",
    "\n",
    "    # round\n",
    "    elapsed_rounded = int(round((elapsed)))\n",
    "\n",
    "    # hh:mm:ss\n",
    "    return str(datetime.timedelta(seconds=elapsed_rounded))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random seed\n",
    "seed_val = 42\n",
    "random.seed(seed_val)\n",
    "np.random.seed(seed_val)\n",
    "torch.manual_seed(seed_val)\n",
    "torch.cuda.manual_seed_all(seed_val)\n",
    "\n",
    "# gradient zero\n",
    "model.zero_grad()\n",
    "\n",
    "# Iterate by epochs\n",
    "for epoch_i in range(0, epochs):\n",
    "\n",
    "    # ========================================\n",
    "    #               Training\n",
    "    # ========================================\n",
    "\n",
    "    print(\"\")\n",
    "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
    "    print('Training...')\n",
    "\n",
    "    t0 = time.time()\n",
    "\n",
    "    total_loss = 0\n",
    "\n",
    "    model.train()\n",
    "\n",
    "    for step, batch in enumerate(train_dataloader):\n",
    "        # 경과 정보 표시\n",
    "        if step % 500 == 0 and not step == 0:\n",
    "            elapsed = format_time(time.time() - t0)\n",
    "            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n",
    "\n",
    "        batch = tuple(t.to(device) for t in batch)\n",
    "\n",
    "        b_input_ids, b_input_mask, b_labels = batch\n",
    "\n",
    "        outputs = model(b_input_ids,\n",
    "                        token_type_ids=None,\n",
    "                        attention_mask=b_input_mask,\n",
    "                        labels=b_labels)\n",
    "\n",
    "       # Loss\n",
    "        loss = outputs[0]\n",
    "\n",
    "        # Total Loss\n",
    "        total_loss += loss.item()\n",
    "\n",
    "        # Grandient using Backward\n",
    "        loss.backward()\n",
    "\n",
    "        # Gradient climping\n",
    "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
    "\n",
    "        # parameter update via gradient\n",
    "        optimizer.step()\n",
    "\n",
    "        # Scheduler\n",
    "        scheduler.step()\n",
    "\n",
    "        # Gradient zero\n",
    "        model.zero_grad()\n",
    "\n",
    "    # Average loss\n",
    "    avg_train_loss = total_loss / len(train_dataloader)\n",
    "\n",
    "    print(\"\")\n",
    "    print(\"  Average training loss: {0:.2f}\".format(avg_train_loss))\n",
    "    print(\"  Training epcoh took: {:}\".format(format_time(time.time() - t0)))\n",
    "\n",
    "    # ========================================\n",
    "    #               Validation\n",
    "    # ========================================\n",
    "\n",
    "    print(\"\")\n",
    "    print(\"Running Validation...\")\n",
    "\n",
    "    #Time\n",
    "    t0 = time.time()\n",
    "\n",
    "    # Evaluation model\n",
    "    model.eval()\n",
    "\n",
    "    eval_loss, eval_accuracy = 0, 0\n",
    "    nb_eval_steps, nb_eval_examples = 0, 0\n",
    "\n",
    "    # import from dataloader by batch\n",
    "    for batch in validation_dataloader:\n",
    "        # Batch to GPU\n",
    "        batch = tuple(t.to(device) for t in batch)\n",
    "\n",
    "        # Extract data from batch\n",
    "        b_input_ids, b_input_mask, b_labels = batch\n",
    "\n",
    "        # no gradient\n",
    "        with torch.no_grad():\n",
    "            # Forward 수행\n",
    "            outputs = model(b_input_ids,\n",
    "                            token_type_ids=None,\n",
    "                            attention_mask=b_input_mask)\n",
    "\n",
    "        # Loss\n",
    "        logits = outputs[0]\n",
    "\n",
    "        # Data to CPU\n",
    "        logits = logits.detach().cpu().numpy()\n",
    "        label_ids = b_labels.to('cpu').numpy()\n",
    "\n",
    "        # Accuracy\n",
    "        tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n",
    "        eval_accuracy += tmp_eval_accuracy\n",
    "        nb_eval_steps += 1\n",
    "\n",
    "    print(\"  Accuracy: {0:.2f}\".format(eval_accuracy/nb_eval_steps))\n",
    "    print(\"  Validation took: {:}\".format(format_time(time.time() - t0)))\n",
    "\n",
    "print(\"\")\n",
    "print(\"Training complete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
